import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the January–July readings (oxygen measurements from a water
# treatment system, sampled several times a day).
df = pd.read_csv('datos/datos_enero_julio.csv')
df
| Unnamed: 0 | Fecha | Hora | O1 | O2 | O3 | O4 | O5 | O6 | O7 | O8 | O9 | O10 | O11 | O12 | Selector | Airación | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2022-01-01 | 1899-12-31 01:00:00 | 0.51 | 3.21 | 2.48 | 2.71 | 3.20 | 1.95 | 3.22 | 3.10 | 3.54 | 2.61 | 2.47 | 2.38 | 2.422 | 2.752857 |
| 1 | 2 | 2022-01-01 | 1899-12-31 05:00:00 | 0.69 | 3.53 | 2.25 | 2.83 | 3.17 | 1.58 | 2.45 | 2.94 | 3.40 | 2.85 | 2.33 | 2.56 | 2.494 | 2.587143 |
| 2 | 3 | 2022-01-01 | 1899-12-31 09:00:00 | 0.78 | 3.62 | 2.03 | 2.41 | 2.35 | 1.53 | 2.39 | 2.63 | 3.07 | 2.33 | 2.08 | 2.37 | 2.238 | 2.342857 |
| 3 | 4 | 2022-01-01 | 1899-12-31 13:00:00 | 0.71 | 2.54 | 1.63 | 1.89 | 1.47 | 0.87 | 1.71 | 1.96 | 1.88 | 1.49 | 1.36 | 1.17 | 1.648 | 1.491429 |
| 4 | 5 | 2022-01-01 | 1899-12-31 17:00:00 | 0.73 | 2.14 | 1.14 | 1.37 | 1.16 | 0.58 | 1.17 | 1.86 | 1.65 | 0.95 | 0.91 | 0.84 | 1.308 | 1.137143 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1256 | 1257 | 2022-07-31 | 1899-12-31 01:00:00 | 2.40 | 4.47 | 2.43 | 2.63 | 2.29 | 1.44 | 2.35 | 2.81 | 3.14 | 2.59 | 2.62 | 2.96 | 2.844 | 2.558571 |
| 1257 | 1258 | 2022-07-31 | 1899-12-31 05:00:00 | 2.34 | 4.69 | 2.60 | 2.43 | 1.90 | 1.22 | 1.95 | 2.93 | 3.01 | 2.72 | 2.89 | 3.13 | 2.792 | 2.550000 |
| 1258 | 1259 | 2022-07-31 | 1899-12-31 09:00:00 | 2.53 | 4.89 | 2.75 | 2.50 | 2.40 | 1.33 | 2.51 | 3.01 | 3.26 | 3.21 | 3.04 | 3.21 | 3.014 | 2.795714 |
| 1259 | 1260 | 2022-07-31 | 1899-12-31 15:00:00 | 3.05 | 4.32 | 1.74 | 1.56 | 1.37 | 0.89 | 1.89 | 2.51 | 2.79 | 2.35 | 2.38 | 2.80 | 2.408 | 2.230000 |
| 1260 | 1261 | 2022-07-31 | 1899-12-31 17:00:00 | 2.32 | 0.31 | 1.42 | 1.11 | 0.78 | 0.68 | 1.59 | 2.46 | 2.67 | 2.33 | 2.38 | 2.63 | 1.188 | 2.105714 |
1261 rows × 17 columns
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Unnamed: 0 | 1261.0 | 631.000000 | 364.163654 | 1.000 | 316.000000 | 631.000000 | 946.000000 | 1261.000000 |
| O1 | 1261.0 | 2.153053 | 1.226441 | 0.250 | 1.170000 | 1.960000 | 2.860000 | 8.490000 |
| O2 | 1261.0 | 3.574853 | 1.181473 | 0.310 | 2.770000 | 3.410000 | 4.350000 | 8.070000 |
| O3 | 1261.0 | 3.208509 | 5.046665 | 0.230 | 2.200000 | 3.040000 | 3.920000 | 177.000000 |
| O4 | 1261.0 | 3.363688 | 1.309204 | 0.250 | 2.380000 | 3.360000 | 4.280000 | 7.000000 |
| O5 | 1261.0 | 3.089865 | 1.343925 | 0.200 | 2.090000 | 3.060000 | 4.030000 | 6.860000 |
| O6 | 1261.0 | 2.261285 | 1.141012 | 0.090 | 1.430000 | 2.160000 | 3.040000 | 7.220000 |
| O7 | 1261.0 | 2.940476 | 2.057982 | 0.250 | 2.150000 | 2.900000 | 3.680000 | 63.280000 |
| O8 | 1261.0 | 3.121427 | 1.127693 | 0.260 | 2.450000 | 3.170000 | 3.870000 | 7.680000 |
| O9 | 1261.0 | 3.065250 | 1.077779 | 0.240 | 2.400000 | 3.110000 | 3.750000 | 7.620000 |
| O10 | 1261.0 | 2.633545 | 1.067298 | 0.140 | 1.970000 | 2.590000 | 3.270000 | 7.520000 |
| O11 | 1261.0 | 2.452109 | 1.105146 | 0.170 | 1.820000 | 2.380000 | 3.040000 | 16.000000 |
| O12 | 1261.0 | 2.495908 | 1.022001 | 0.170 | 1.870000 | 2.430000 | 3.080000 | 7.320000 |
| Selector | 1261.0 | 3.076343 | 1.415870 | 0.424 | 2.322000 | 3.020000 | 3.700000 | 37.250000 |
| Airación | 1261.0 | 2.709916 | 1.062122 | 0.280 | 2.085714 | 2.702857 | 3.334286 | 11.088571 |
# Histogram + KDE for every measurement column, to eyeball the
# distributions before any outlier handling.
sensor_columns = ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8',
                  'O9', 'O10', 'O11', 'O12', 'Selector', 'Airación']
for col in sensor_columns:
    sns.displot(df, x=col, kde=True)
Distribution plots help with outlier detection. A sensible way to deal with outliers is to trim the data, discarding observations located in the lowest/highest percentiles. I'll keep observations with an absolute Z value below 3 — i.e., within three standard deviations of the variable mean.
# Tag each observation with a sequential ID so trimmed rows can be traced
# back to the original data later.
df['ID'] = np.arange(1, len(df) + 1)
from scipy import stats
# Keep only the rows whose measurements all lie within 3 standard
# deviations of their column mean (|z| < 3).
# FIX: the original positional slice df.iloc[:, 3:] silently included the
# just-added ID column in the z-score test; selecting the measurement
# columns by name restricts the filter to actual sensor data (ID is kept
# in df_trim, as before, but no longer participates in the filter).
measure_cols = ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8',
                'O9', 'O10', 'O11', 'O12', 'Selector', 'Airación']
zscores = np.abs(stats.zscore(df[measure_cols]))
df_trim = df[measure_cols + ['ID']][(zscores < 3).all(axis=1)]
df_trim.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| O1 | 1232.0 | 2.096445 | 1.109235 | 0.250 | 1.160000 | 1.945 | 2.820000 | 5.670000 |
| O2 | 1232.0 | 3.534334 | 1.123165 | 0.310 | 2.750000 | 3.395 | 4.330000 | 6.860000 |
| O3 | 1232.0 | 3.031088 | 1.175490 | 0.230 | 2.162500 | 3.015 | 3.882500 | 6.310000 |
| O4 | 1232.0 | 3.334131 | 1.286144 | 0.250 | 2.367500 | 3.335 | 4.252500 | 6.920000 |
| O5 | 1232.0 | 3.062995 | 1.323110 | 0.200 | 2.077500 | 3.050 | 4.010000 | 6.860000 |
| O6 | 1232.0 | 2.204261 | 1.067393 | 0.090 | 1.417500 | 2.120 | 2.992500 | 5.470000 |
| O7 | 1232.0 | 2.841169 | 1.097014 | 0.250 | 2.140000 | 2.890 | 3.630000 | 5.570000 |
| O8 | 1232.0 | 3.072898 | 1.069664 | 0.260 | 2.440000 | 3.140 | 3.830000 | 5.680000 |
| O9 | 1232.0 | 3.017646 | 1.014465 | 0.240 | 2.390000 | 3.095 | 3.722500 | 5.850000 |
| O10 | 1232.0 | 2.582021 | 0.986003 | 0.140 | 1.950000 | 2.560 | 3.240000 | 5.320000 |
| O11 | 1232.0 | 2.391339 | 0.953600 | 0.170 | 1.800000 | 2.370 | 3.000000 | 5.210000 |
| O12 | 1232.0 | 2.445641 | 0.932615 | 0.170 | 1.860000 | 2.410 | 3.040000 | 5.280000 |
| Selector | 1232.0 | 3.010109 | 0.980161 | 0.424 | 2.308000 | 3.010 | 3.652000 | 6.068000 |
| Airación | 1232.0 | 2.650625 | 0.962480 | 0.280 | 2.070357 | 2.660 | 3.288214 | 5.228571 |
| ID | 1232.0 | 637.392857 | 363.442997 | 1.000 | 324.750000 | 635.500 | 953.250000 | 1261.000000 |
# Re-inspect the distributions after the 3-sigma trim.
for feature in ('O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8',
                'O9', 'O10', 'O11', 'O12', 'Selector', 'Airación'):
    sns.displot(df_trim, x=feature, kde=True)
# Recover the non-numeric columns (Fecha, Hora, …) for the surviving rows.
# FIX: every column of df_trim is already present in df, so the original
# columns.difference() was always empty and the merge was just an
# index-aligned re-join of df onto the trimmed index.  Selecting the
# surviving index labels directly is equivalent and far clearer.
df_trimmed = df.loc[df_trim.index]
Having trimmed outliers, correlation plots show strong synchronic and diachronic correlation between variables. This could affect the size and significance of OLS regression coefficients.
The next plots show the temporal correlation between the dependent variable ("Airación") and a vector of independent variables.
# Overlay the target ("Airación") with each candidate predictor over time.
# The predictor is drawn on a secondary y-axis so both series stay readable.
predictors = ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7',
              'O8', 'O9', 'O10', 'O11', 'O12', 'Selector']
for predictor in predictors:
    fig, ax = plt.subplots()
    # Target series: left axis, orange.
    df_trimmed.plot(x='Fecha', y='Airación', ax=ax, use_index=True,
                    color='orange', figsize=(12, 3))
    # Predictor series: right axis, blue.
    df_trimmed.plot(x='Fecha', y=predictor, ax=ax, use_index=True,
                    secondary_y=True, color='blue', figsize=(12, 3))
    plt.show()
Yes, there is both temporal and cross-sectional correlation between the Xs and the Y.
# Set the figure size *before* drawing — calling sns.set() after the plot
# only affects figures created later, not the heatmap already rendered.
sns.set(rc={'figure.figsize': (20, 20)})
# numeric_only=True silences the pandas FutureWarning seen in the original
# run and makes the intent (correlate only numeric columns) explicit.
sns.heatmap(df_trimmed.corr(numeric_only=True), annot=True, cmap='vlag')
/var/folders/lc/lhrpm9yj66z82tp7q3nbgr440000gn/T/ipykernel_992/1548811932.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(df_trimmed.corr(), annot = True, cmap = 'vlag')
# Quick look at the numeric columns that will feed the models.
df_trimmed[['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7',
'O8', 'O9', 'O10', 'O11', 'O12', 'Selector', 'Airación']]
| O1 | O2 | O3 | O4 | O5 | O6 | O7 | O8 | O9 | O10 | O11 | O12 | Selector | Airación | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.51 | 3.21 | 2.48 | 2.71 | 3.20 | 1.95 | 3.22 | 3.10 | 3.54 | 2.61 | 2.47 | 2.38 | 2.422 | 2.752857 |
| 1 | 0.69 | 3.53 | 2.25 | 2.83 | 3.17 | 1.58 | 2.45 | 2.94 | 3.40 | 2.85 | 2.33 | 2.56 | 2.494 | 2.587143 |
| 2 | 0.78 | 3.62 | 2.03 | 2.41 | 2.35 | 1.53 | 2.39 | 2.63 | 3.07 | 2.33 | 2.08 | 2.37 | 2.238 | 2.342857 |
| 3 | 0.71 | 2.54 | 1.63 | 1.89 | 1.47 | 0.87 | 1.71 | 1.96 | 1.88 | 1.49 | 1.36 | 1.17 | 1.648 | 1.491429 |
| 4 | 0.73 | 2.14 | 1.14 | 1.37 | 1.16 | 0.58 | 1.17 | 1.86 | 1.65 | 0.95 | 0.91 | 0.84 | 1.308 | 1.137143 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1256 | 2.40 | 4.47 | 2.43 | 2.63 | 2.29 | 1.44 | 2.35 | 2.81 | 3.14 | 2.59 | 2.62 | 2.96 | 2.844 | 2.558571 |
| 1257 | 2.34 | 4.69 | 2.60 | 2.43 | 1.90 | 1.22 | 1.95 | 2.93 | 3.01 | 2.72 | 2.89 | 3.13 | 2.792 | 2.550000 |
| 1258 | 2.53 | 4.89 | 2.75 | 2.50 | 2.40 | 1.33 | 2.51 | 3.01 | 3.26 | 3.21 | 3.04 | 3.21 | 3.014 | 2.795714 |
| 1259 | 3.05 | 4.32 | 1.74 | 1.56 | 1.37 | 0.89 | 1.89 | 2.51 | 2.79 | 2.35 | 2.38 | 2.80 | 2.408 | 2.230000 |
| 1260 | 2.32 | 0.31 | 1.42 | 1.11 | 0.78 | 0.68 | 1.59 | 2.46 | 2.67 | 2.33 | 2.38 | 2.63 | 1.188 | 2.105714 |
1232 rows × 14 columns
# Pairwise scatter plots of all model variables, colored by the target.
# NOTE(review): 'Airación' is continuous, so using it as `hue` creates one
# palette level per distinct value — slow on big data, but it reads as a
# color gradient here.
cols_to_use=df_trimmed[['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7',
'O8', 'O9', 'O10', 'O11', 'O12', 'Selector', 'Airación']]
sns.pairplot(cols_to_use, hue = 'Airación', palette= 'viridis_r')
<seaborn.axisgrid.PairGrid at 0x2a9661250>
However, the high correlation doesn't affect our metric of interest: R2. Hence, we shouldn't exclude variables from the models.
import statsmodels.formula.api as snf
# Assemble the patsy formula " Airación ~O1 + O2 + … + Selector".
regressors = ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7',
              'O8', 'O9', 'O10', 'O11', 'O12', 'Selector']
X = ' + '.join(regressors)
Y = ' Airación ~'
# Fit ordinary least squares on the outlier-trimmed data and report.
model_ols = snf.ols(Y + X, df_trimmed).fit()
model_ols.summary()
| Dep. Variable: | Airación | R-squared: | 1.000 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 1.000 |
| Method: | Least Squares | F-statistic: | 9.625e+06 |
| Date: | Wed, 22 Feb 2023 | Prob (F-statistic): | 0.00 |
| Time: | 13:24:49 | Log-Likelihood: | 5408.1 |
| No. Observations: | 1232 | AIC: | -1.079e+04 |
| Df Residuals: | 1218 | BIC: | -1.072e+04 |
| Df Model: | 13 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | -0.0002 | 0.000 | -0.441 | 0.659 | -0.001 | 0.001 |
| O1 | -3.441e-05 | 0.001 | -0.047 | 0.962 | -0.001 | 0.001 |
| O2 | 5.592e-05 | 0.001 | 0.077 | 0.939 | -0.001 | 0.001 |
| O3 | 6.503e-05 | 0.001 | 0.084 | 0.933 | -0.001 | 0.002 |
| O4 | 0.0002 | 0.001 | 0.248 | 0.804 | -0.001 | 0.002 |
| O5 | 8.039e-05 | 0.001 | 0.103 | 0.918 | -0.001 | 0.002 |
| O6 | 0.1429 | 0.000 | 610.826 | 0.000 | 0.142 | 0.143 |
| O7 | 0.1424 | 0.000 | 379.064 | 0.000 | 0.142 | 0.143 |
| O8 | 0.1434 | 0.000 | 323.306 | 0.000 | 0.142 | 0.144 |
| O9 | 0.1427 | 0.000 | 343.382 | 0.000 | 0.142 | 0.143 |
| O10 | 0.1435 | 0.000 | 341.533 | 0.000 | 0.143 | 0.144 |
| O11 | 0.1423 | 0.000 | 296.033 | 0.000 | 0.141 | 0.143 |
| O12 | 0.1429 | 0.000 | 428.343 | 0.000 | 0.142 | 0.144 |
| Selector | -0.0004 | 0.004 | -0.115 | 0.909 | -0.007 | 0.007 |
| Omnibus: | 3477.809 | Durbin-Watson: | 2.005 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 75707168.849 |
| Skew: | -34.745 | Prob(JB): | 0.00 |
| Kurtosis: | 1215.430 | Cond. No. | 491. |
Four machine learning models will be compared based on their predictive performance using the R2 metric. The dependent variable is a continuous measurement that reflects the amount of oxygen coming out of a water treatment system.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn import metrics
# FIX: select features/target by name.  The original positional slices
# (iloc[:, 3:-2] for X, iloc[:, -2] for Y) silently depend on column order
# and break if a column is added, dropped or moved.
feature_cols = ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7',
                'O8', 'O9', 'O10', 'O11', 'O12', 'Selector']
X = df_trimmed[feature_cols]
Y = df_trimmed['Airación']
# Keep 25% of the data for testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 4833)
# Grid-search the (tiny) linear-regression hyperparameter space, 3-fold CV,
# selecting and refitting on R².
search_space = {'fit_intercept': [True, False]}
modelo_OLS_CV = GridSearchCV(
    estimator=LinearRegression(),
    param_grid=search_space,
    scoring=["r2"],
    refit="r2",
    cv=3,
).fit(X_train, y_train)
print(modelo_OLS_CV.best_estimator_)
LinearRegression()
# Refit plain OLS (the CV winner kept the default fit_intercept=True).
regr = LinearRegression(fit_intercept = True)
regr_model = regr.fit(X_train, y_train)
y_pred_ols=regr_model.predict(X_test)
# BUG FIX: r2_score expects (y_true, y_pred); the original passed the
# prediction first, which evaluates R² against the wrong reference series.
metrics.r2_score(y_test, y_pred_ols)
# AdaBoost hyperparameter search: loss function, learning rate and
# ensemble size, scored by R² with 5-fold CV.
search_space = {
    'loss': ['linear', 'square', 'exponential'],
    'learning_rate': np.linspace(0.01, 1),
    'n_estimators': [10, 50, 250],
}
modelo_adaboost_CV = GridSearchCV(
    estimator=AdaBoostRegressor(),
    param_grid=search_space,
    scoring=["r2"],
    refit="r2",
    cv=5,
).fit(X_train, y_train)
modelo_adaboost_CV.best_estimator_
AdaBoostRegressor(learning_rate=0.8989795918367347, loss='square',
n_estimators=250)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. AdaBoostRegressor(learning_rate=0.8989795918367347, loss='square',
# Refit AdaBoost with the hyperparameters the grid search actually selected.
# FIX: the original hand-copied a learning_rate (0.9191…) that differs from
# the CV winner (0.8989…) and wrapped an un-searched DecisionTreeRegressor
# base estimator — reusing best_params_ keeps tuning and deployment in sync.
modelo_adab = AdaBoostRegressor(
    **modelo_adaboost_CV.best_params_,
    random_state = 45445).fit(X_train, y_train)
#Predict the response for test dataset
y_pred=modelo_adab.predict(X_test)
# R squared
print("The R squared of the Adaboost model on the testing set is:", metrics.r2_score(y_test, y_pred))
The R squared of the Adaboost model on the testing set is: 0.9855230106053857
# XGBoost hyperparameter search over ensemble size, tree depth,
# regularization gamma and learning rate (3-fold CV, R²).
xgb_model = XGBRegressor(random_state = 2023)
search_space = {
    "n_estimators": [100, 200, 500],
    "max_depth": [3, 6, 12],
    "gamma": [0.1, 1],
    "learning_rate": [0.01, 0.1, 1],
}
modelo_xgb_CV = GridSearchCV(
    estimator=xgb_model,
    param_grid=search_space,
    scoring=["r2"],
    refit="r2",
    cv=3,
).fit(X_train, y_train)
modelo_xgb_CV.best_estimator_
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=0.1, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.01, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=500, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=2023, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=0.1, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.01, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=500, n_jobs=None, num_parallel_tree=None,
# Refit XGBoost using the hyperparameters chosen by the grid search above
# (gamma=0.1, learning_rate=0.01, max_depth=6, n_estimators=500).
best_xgb_params = dict(gamma=0.1, learning_rate=0.01, max_depth=6, n_estimators=500)
xgb_model_selected = XGBRegressor(random_state=2023, **best_xgb_params)
xgb_model_selected.fit(X_train, y_train)
#Predict the response for test dataset
y_pred_xgb=xgb_model_selected.predict(X_test)
# R squared
print("The R squared of the Xgboost model on the testing set is:", metrics.r2_score(y_test, y_pred_xgb))
The R squared of the Xgboost model on the testing set is: 0.9943539579981866
#gridsearch
# FIX(review): the original grid used min_samples_split=1 (rejected by
# modern scikit-learn, which requires integers >= 2) and max_features of
# 50/100 even though only 13 features exist — GridSearchCV silently scores
# such failing candidates as NaN.  The values below keep the original
# intent while staying valid on every sklearn version.
search_space = {'random_state': [45445],
                'bootstrap': [True, False],
                'max_features': (1, 5, 10, 13),
                'min_samples_split': (2, 5, 10, 20),
                'n_estimators': (1, 20, 50, 100)}
modelo_randomforest_CV = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=search_space,
    scoring=["r2"],
    refit="r2",
    cv=3).fit(X_train, y_train)
modelo_randomforest_CV.best_estimator_
RandomForestRegressor(max_features=10, min_samples_split=1, random_state=45445)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor(max_features=10, min_samples_split=1, random_state=45445)
#Predict the response for test dataset
# FIX: reuse the estimator the grid search already refit on the full
# training set (refit="r2") instead of hand-copying hyperparameters — the
# original copied min_samples_split=1, which modern scikit-learn rejects.
random_forest = modelo_randomforest_CV.best_estimator_
y_pred_randomforest = random_forest.predict(X_test)
# R squared
print("The R squared of the Random Forest model on the testing set is:", metrics.r2_score(y_test, y_pred_randomforest))
The R squared of the Random Forest model on the testing set is: 0.9901802038301838
Although OLS has the strongest performance, there's a very small difference between the four models. For explainability and computational simplicity, I would recommend implementing OLS for predicting the target variable "Airación". However, the difference is so small that any of the four models is deemed adequate for predicting the target variable.
# Collect each model's test-set R² into a small comparison frame.
# BUG FIX: r2_score takes (y_true, y_pred); the OLS entry had the
# arguments swapped, unlike the other three models.
ols = metrics.r2_score(y_test, y_pred_ols)
adaboost = metrics.r2_score(y_test, y_pred)
xgboost = metrics.r2_score(y_test, y_pred_xgb)
rforest = metrics.r2_score(y_test, y_pred_randomforest)
r2_values = [ols, adaboost, xgboost, rforest]
data = {'Models': ["ols", "adaboost", "xgb", "rforest"],
        'R2': r2_values}
r2_df = pd.DataFrame(data)
r2_df
| Models | R2 | |
|---|---|---|
| 0 | ols | 0.999952 |
| 1 | adaboost | 0.985523 |
| 2 | xgb | 0.994354 |
| 3 | rforest | 0.990180 |
# Bar chart comparing the four models' test-set R².
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=r2_df, y='R2', x='Models', hue='R2',
            color='Red', dodge=False, width=0.5)
plt.show()